#1. Import the datasets and libraries, check datatype, statistical summary, shape, null values or incorrect imputation.
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from sklearn.linear_model import LogisticRegression
# importing plotting libraries
import matplotlib.pyplot as plt
# To enable plotting graphs in Jupyter notebook
%matplotlib inline
#importing seaborn for statistical plots
import seaborn as sns
#Let us break the X and y dataframes into training set and test set. For this we will use
#Sklearn package's data splitting function which is based on random function
from sklearn.model_selection import train_test_split
import numpy as np
import os,sys
from scipy import stats
# calculate accuracy measures and confusion matrix
from sklearn import metrics
# Load the bank personal-loan dataset; every later cell reads the global `df`.
df = pd.read_csv("Bank_Personal_Loan_Modelling.csv")
# Peek at the first rows (notebook cell output; no effect when run as a plain script)
df.head()
#check datatype of each column
df.info()
#shape: (rows, columns)
df.shape
#null values per column
df.isnull().sum()
#statistical summary; the min/max values are used below to spot incorrect imputation
df.describe().transpose()
## Incorrect imputation check (ranges taken from df.describe() above)
# Age : is between 23 and 67 - All good
# Experience : is between -3 and 43 - Experience can't be negative - Incorrect imputation
# Income : is between 8 and 224 - All good
# Family : is between 1 and 4 - All good
# CCAvg : is between 0 and 10 - All good
# Experience is negative for some rows, which is incorrect.
# The direct comparison `< 0` replaces the original `~df['Experience'].gt(-1)`;
# for this integer column the two select the same rows and read more clearly.
df[df['Experience'] < 0]['Experience'].count()
#All records with negative Experience
df[df['Experience'] < 0]
#Experience is never greater than Age (empty result) - All good
df[df['Experience'].gt(df['Age'])]
#Number of unique values in each column
df.nunique()
#Number of people with zero mortgage (`<= 0` mirrors the original `~gt(0)` mask)
print('Number of people with zero mortgage :', df[df['Mortgage'] <= 0]['Mortgage'].count())
#Number of people with zero credit card spending per month
print('Number of people with zero credit card spending per month :', df[df['CCAvg'] <= 0]['CCAvg'].count())
#Value counts of all categorical columns
#Education is a categorical column
df['Education'].value_counts()
# dropping ID as it will be 1-1 mapping anyways
cr_df = df.drop('ID', axis =1 ) # dropping this column as it will be 1-1 mapping anyways
cr_df.head()
#Univariate analysis: one histogram per column
cr_df.hist(stacked=False, bins=100, figsize=(12,30), layout=(14,2));
#Bivariate analysis: pairwise correlation matrix
cr_df.corr()
#Age and Experience have the highest correlation - 0.99
sns.pairplot(cr_df);
# NOTE(review): pandas_profiling was renamed to ydata-profiling in newer
# releases; this import assumes the legacy package is installed - confirm.
import pandas_profiling
#Getting the pandas profiling report and check for incorrect imputation
pandas_profiling.ProfileReport(cr_df)
#Get data model ready
#Replacing the categorical Education codes with their actual labels
cr_df['Education'] = cr_df['Education'].replace({1: 'Undergrad', 2: 'Graduate', 3: 'Advanced/Professional'})
cr_df.head(6)
#Creating dummy (one-hot) columns from Education
cr_df = pd.get_dummies(cr_df, columns=['Education'])
cr_df.head(6)
#Count of incorrect Experience (negative Experience) before the fix
cr_df[cr_df['Experience'] < 0]['Experience'].count()
cr_df.describe().transpose()
# Set Experience to 0 wherever it is negative.
# Vectorized boolean-mask assignment replaces the original row-by-row
# `for i in ...: cr_df.loc[i, 'Experience'] = 0` loop - same result, one pass.
cr_df.loc[cr_df['Experience'] < 0, 'Experience'] = 0
#Check for negative Experience after the change (expected count: 0)
cr_df[cr_df['Experience'] < 0]['Experience'].count()
#statistical summary again
cr_df.describe().transpose()
## Define X and Y variables
X = cr_df.drop('Personal Loan', axis=1)  # Predictor feature columns
Y = cr_df['Personal Loan']               # Predicted class (1=True, 0=False)
## Split into training and test set (70/30); fixed random_state for reproducibility
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)
#Training Data
x_train.head()
#Testing Data
x_test.head()
print('Shape of Training Data', x_train.shape)
print('Shape of Testing Data', x_test.shape)
print("{0:0.2f}% data is in training set".format((len(x_train)/len(cr_df.index)) * 100))
print("{0:0.2f}% data is in test set".format((len(x_test)/len(cr_df.index)) * 100))
# Class balance in the full set and in each split.
# `(series == v).sum()` replaces the original `len(series[series[:] == v])` -
# the same count without the redundant `[:]` slice or an intermediate Series.
n_total = len(cr_df.index)
n_true = (cr_df['Personal Loan'] == 1).sum()
n_false = (cr_df['Personal Loan'] == 0).sum()
print("Original Personal Loan True Values : {0} ({1:0.2f}%)".format(n_true, n_true / n_total * 100))
print("Original Personal Loan False Values : {0} ({1:0.2f}%)".format(n_false, n_false / n_total * 100))
print("")
print("Training Personal Loan True Values : {0} ({1:0.2f}%)".format((y_train == 1).sum(), (y_train == 1).sum() / len(y_train) * 100))
print("Training Personal Loan False Values : {0} ({1:0.2f}%)".format((y_train == 0).sum(), (y_train == 0).sum() / len(y_train) * 100))
print("")
print("Test Personal Loan True Values : {0} ({1:0.2f}%)".format((y_test == 1).sum(), (y_test == 1).sum() / len(y_test) * 100))
print("Test Personal Loan False Values : {0} ({1:0.2f}%)".format((y_test == 0).sum(), (y_test == 0).sum() / len(y_test) * 100))
print("")
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, roc_auc_score,accuracy_score
# --- Baseline logistic regression ---
# Fit on the training split, then evaluate on the held-out test split.
model = LogisticRegression(solver="liblinear")
model.fit(x_train, y_train)
# Hard-label predictions on the test set
y_predict = model.predict(x_test)
# Fitted coefficients (one column per feature) plus the intercept
coefficients = pd.DataFrame(model.coef_)
coefficients['intercept'] = model.intercept_
print(coefficients)
# Mean accuracy on the training split
train_accuracy = model.score(x_train, y_train)
print(train_accuracy)
# Mean accuracy on the test split
test_accuracy = model.score(x_test, y_test)
print(test_accuracy)
# Confusion matrix with the positive class (1) listed first on both axes
cm = metrics.confusion_matrix(y_test, y_predict, labels=[1, 0])
cm_frame = pd.DataFrame(cm, index=["1", "0"], columns=["Predict 1", "Predict 0"])
plt.figure(figsize=(7, 5))
# Confusion matrix as a heat map
sns.heatmap(cm_frame, annot=True, fmt='.0f')
# Raw matrix and the usual classification metrics
print(cm)
print("Recall:", recall_score(y_test, y_predict))
print()
print("Precision:", precision_score(y_test, y_predict))
print()
print("F1 Score:", f1_score(y_test, y_predict))
print()
print("Roc Auc Score:", roc_auc_score(y_test, y_predict))
#AUC ROC curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# Score the positive-class probabilities for BOTH the curve and the reported
# area. The original computed the area from hard predict() labels while the
# curve was drawn from predict_proba(), so the legend's "area" did not match
# the curve actually plotted; using the probabilities for both fixes that.
y_score = model.predict_proba(x_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, y_score)
fpr, tpr, thresholds = roc_curve(y_test, y_score)
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
# Checking Parameters of logistic regression
model.get_params()
# If we don't specify the parameters in the model it takes default values.
# Running a loop to check different values of 'solver'.
# All solvers support 'l2'; only 'liblinear' and 'saga' also support 'l1'.
train_score = []
test_score = []
solver = ['newton-cg','lbfgs','liblinear','sag','saga']
for s in solver:
    model = LogisticRegression(random_state=1, penalty='l2', C=0.75, solver=s)
    model.fit(x_train, y_train)
    # score() predicts internally, so the original's unused per-iteration
    # `y_predict = model.predict(x_test)` was dropped as dead work.
    train_score.append(round(model.score(x_train, y_train), 3))
    test_score.append(round(model.score(x_test, y_test), 3))
print('Solver : ', solver)
print()
print('Training Score : ', train_score)
print()
print('Testing Score : ', test_score)
# Repeat the solver sweep with the 'l1' penalty.
train_score = []
test_score = []
solver = ['liblinear','saga']  # only these solvers support 'l1'
for s in solver:
    model = LogisticRegression(random_state=1, penalty='l1', C=0.75, solver=s)
    model.fit(x_train, y_train)
    # unused per-iteration `y_predict = model.predict(x_test)` removed (dead work)
    train_score.append(round(model.score(x_train, y_train), 3))
    test_score.append(round(model.score(x_test, y_test), 3))
print('Solver : ', solver)
print()
print('Training Score : ', train_score)
print()
print('Testing Score : ', test_score)
## Highest accuracy is the same for 'l1' with 'liblinear' and 'l2' with 'newton-cg'
# Fit with class_weight='balanced' ...
model = LogisticRegression(random_state=1, penalty='l1', solver='liblinear', class_weight='balanced')
model.fit(x_train, y_train)
print("Training accuracy", model.score(x_train, y_train))
print()
print("Testing accuracy", model.score(x_test, y_test))
# ... and without class_weight (default) for comparison. The original's
# `y_predict = model.predict(x_test)` after each fit was never read before
# being reassigned, so both were removed as dead work.
model = LogisticRegression(random_state=1, penalty='l1', solver='liblinear')
model.fit(x_train, y_train)
print("Training accuracy", model.score(x_train, y_train))
print()
print("Testing accuracy", model.score(x_test, y_test))
## Testing accuracy increased and the model is not overfit; class_weight='balanced' is dropped.
# Running a loop to check different values of 'C' (inverse regularization strength).
# NOTE(review): the preceding markdown says class_weight='balanced' was removed,
# yet this sweep still passes it. Kept as-is so the reported scores (and the
# C=0.2 choice below) stay reproducible - confirm which setting was intended.
train_score = []
test_score = []
C = [0.01,0.1,0.2,0.25,0.5,0.75,1]
for c in C:
    model = LogisticRegression(random_state=1, penalty='l1', solver='liblinear', class_weight='balanced', C=c)
    model.fit(x_train, y_train)
    # unused per-iteration `y_predict = model.predict(x_test)` removed (dead work)
    train_score.append(round(model.score(x_train, y_train), 3))  # training accuracy per C
    test_score.append(round(model.score(x_test, y_test), 3))     # testing accuracy per C
print(C)
print()
print(train_score)
print()
print(test_score)
## Best testing accuracy is obtained for C=0.2
# Therefore the final model is:
model = LogisticRegression(random_state=1, penalty='l1', solver='liblinear', C=0.2)
model.fit(x_train, y_train)
y_predict = model.predict(x_test)
# ("Trainig" typo in the original message fixed)
print("Training accuracy", model.score(x_train, y_train))
print()
print("Testing accuracy", model.score(x_test, y_test))
print()
# Confusion matrix with the positive class (1) listed first on both axes
cm = metrics.confusion_matrix(y_test, y_predict, labels=[1, 0])
df_cm = pd.DataFrame(cm, index=["1", "0"], columns=["Predict 1", "Predict 0"])
plt.figure(figsize=(7, 5))
# Confusion matrix as a heat map
sns.heatmap(df_cm, annot=True, fmt='.0f')
print()
print("Recall:", recall_score(y_test, y_predict))
print()
print("Precision:", precision_score(y_test, y_predict))
print()
print("F1 Score:", f1_score(y_test, y_predict))
print()
print("Roc Auc Score:", roc_auc_score(y_test, y_predict))
# Additional
from yellowbrick.classifier import ClassificationReport, ROCAUC
# Visualize model performance with the yellowbrick library:
# per-class precision/recall/F1 report, fit on train and scored on test.
viz = ClassificationReport(model)
viz.fit(x_train, y_train)
viz.score(x_test, y_test)
viz.show()
# ROC curve / AUC visualizer for the same model
roc = ROCAUC(model)
roc.fit(x_train, y_train)
roc.score(x_test, y_test)
roc.show()
# Interpretation of the confusion matrix computed with labels=[1, 0]:
# rows are actual [1, 0] and columns are predicted [1, 0], so
#   cm[0,0]=TP, cm[1,1]=TN, cm[1,0]=FP, cm[0,1]=FN.
# The original printed cm[0,1] as FP and cm[1,0] as FN - those were swapped.
# It also wrote 'don''t' inside single quotes, which Python concatenates to
# "dont"; double-quoted strings restore the apostrophe.
print('The confusion matrix \n\n')
print('True Positives (TP): we correctly predicted that they will take personal loan ', cm[0, 0], '\n')
print("True Negatives (TN): we correctly predicted that they don't take personal loan ", cm[1, 1], '\n')
print('False Positives (FP): we incorrectly predicted that they will take personal loan (a "Type I error") ', cm[1, 0], ' Falsely predict positive Type I error \n')
print("False Negatives (FN): we incorrectly predicted that they don't take personal loan (a \"Type II error\") ", cm[0, 1], ' Falsely predict negative Type II error \n')